import pandas as pd
from tqdm import tqdm
import os

# Module-level accumulators used by the rest of the script:
#   zhenzhuangs - [first_field, second_field] pairs read from entity2id.txt
#                 (later code matches on the first field and emits the second)
#   triples     - output triples, filled in further down
zhenzhuangs = []
triples = []
with open("../医学知识/entity2id.txt", "r", encoding="utf-8") as entity_file:
    for raw_line in entity_file.readlines():
        fields = raw_line.split()
        # Lines with fewer than two whitespace-separated fields are skipped.
        if len(fields) <= 1:
            continue
        # Only the first two fields are kept; any extra fields are ignored.
        zhenzhuangs.append(fields[:2])
# print(zhenzhuangs)


# Index entity ids by entity name so each lookup is a dict access instead of a
# scan over every entity.  A name may occur more than once in the entity list,
# so all of its ids are kept, in list order, exactly as a full scan would
# have produced them.
entity_ids_by_name = {}
for entity_name, entity_id in zhenzhuangs:
    entity_ids_by_name.setdefault(entity_name, []).append(entity_id)


def _split_cell(value):
    """Split a comma-separated spreadsheet cell into its items.

    pandas represents an empty cell as NaN (a float), which has no ``split``
    method; such cells are treated as containing no items.  Non-string
    values are converted with ``str`` before splitting.
    """
    if pd.isna(value):
        return []
    return str(value).split(',')


# NOTE(review): os.listdir returns entries in arbitrary order and record ids
# are handed out in traversal order, so ids may differ between machines.
record_files = os.listdir("./病历")
# First record id; every row of every file gets the next consecutive id.
# NOTE(review): presumably 145 is the first id not already used by the entity
# table — confirm against entity2id.txt.
tab = 145
for file_name in record_files:
    df = pd.read_excel('./病历/' + file_name)
    # iterrows() has no length, so give tqdm the total explicitly.
    pbar = tqdm(df.iterrows(), total=len(df))
    for index, row in pbar:
        record_id = str(tab)

        # Symptom triples (relation '1'): one per distinct symptom that names
        # a known entity.  dict.fromkeys removes duplicates while keeping the
        # cell's order, so the output is identical from run to run.
        for symptom in dict.fromkeys(_split_cell(row['症状'])):
            for entity_id in entity_ids_by_name.get(symptom, ()):
                triples.append([record_id, entity_id, '1'])

        # Disease triples (relation '2'): only the first listed diagnosis is
        # used.
        diagnoses = _split_cell(row['诊断'])
        if diagnoses:
            for entity_id in entity_ids_by_name.get(diagnoses[0], ()):
                triples.append([record_id, entity_id, '2'])

        pbar.set_description(file_name + ": 进度 %d/%d" % (index, len(df)))
        tab += 1
print(len(triples))

# Store the medical-record triples.
# File layout: first line is the number of triples, then one triple per line
# with its three fields separated by tabs.
# The context manager guarantees the file is flushed and closed even if a
# write fails, instead of relying on interpreter shutdown.
with open("triple.txt", "w", encoding="utf-8") as f:
    f.write(str(len(triples)) + "\n")
    f.writelines(
        "\t".join((first, second, third)) + "\n"
        for first, second, third in triples
    )

# The string literal below is disabled code kept for reference; it is never
# executed.  It would append the same count line and triples to train2id.txt.
'''

# 储存包含疾病-症状的三元组
f = open("train2id.txt", "a", encoding="utf-8")
f.write(str(len(triples))+"\n")
for triple in triples:
    f.write(triple[0] + "\t" + triple[1] + "\t" + triple[2] + "\n")
'''